1.a) Does terrain characteristics (eg. exposition, slope) correlate with land cover, ie. are there major differences between single land covers?
library(terra)
## terra 1.7.78
library(raster)
## Warning: package 'raster' was built under R version 4.4.1
## Cargando paquete requerido: sp
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Cargando paquete requerido: ggplot2
##
## Adjuntando el paquete: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Cargando paquete requerido: lattice
library(data.table)
##
## Adjuntando el paquete: 'data.table'
## The following object is masked from 'package:raster':
##
## shift
## The following object is masked from 'package:terra':
##
## shift
library(sf)
## Warning: package 'sf' was built under R version 4.4.1
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:raster':
##
## intersect, select, union
## The following objects are masked from 'package:terra':
##
## intersect, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(vcd)
## Warning: package 'vcd' was built under R version 4.4.1
## Cargando paquete requerido: grid
##
## Adjuntando el paquete: 'grid'
## The following object is masked from 'package:terra':
##
## depth
##
## Adjuntando el paquete: 'vcd'
## The following object is masked from 'package:raster':
##
## mosaic
## The following objects are masked from 'package:terra':
##
## mosaic, sieve
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.4.1
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Sentinel raster with bands 2 (blue), 3 (green), 4 (red) & 8 (NIR).
rast1 <- rast("WVP_10m.jp2")
rast2 <- rast("AOT_10m.jp2")
rast3 <- rast("B02_10m.jp2")
rast4 <- rast("B03_10m.jp2")
rast5 <- rast("B04_10m.jp2")
rast6 <- rast("B08_10m.jp2")
rast7 <- rast("TCI_10m.jp2")
rast_fogo <- c(rast3,rast4,rast5,rast6)
plot(rast_fogo)
Crop & Stack
# extension
crop_extent <- ext(c(765000,795000,1638000,1667000)) #xmin, xmax, ymin, ymax
# crop stack
fogo_crop <- crop(rast_fogo, crop_extent)
plot(fogo_crop$B08_10m)
writeRaster(fogo_crop, "fogo_crop.tif", overwrite=TRUE)
True color composition
# true
plotRGB(fogo_crop, b =1,
g =2,
r = 3,
stretch="lin")
# false
plotRGB(fogo_crop, b =1,
g =2,
r = 4, # NRI
stretch="lin")
Calculate NDVI ndvi = (NIR - RED)/(NIR + RED)
ndvi = (fogo_crop[[4]] - fogo_crop[[3]]) / (fogo_crop[[4]] + fogo_crop[[3]])
plot(ndvi)
Other method: function for NDVI
NDVI <- function(nir, red){
if (length(nir) != length(red)){
stop("NIR and RED dont match")
}
ndvi <- (nir-red) / (nir+red)
names(ndvi)<- "ndvi"
ndvi
}
# Apply the function
ndvi_fog <-NDVI(fogo_crop$B08_10m, fogo_crop$B04_10m)
plot(ndvi_fog)
Unir NDVI con el stack
result <- c(fogo_crop, ndvi)
names(result) <- c("B02_may", "B03_may","B04_may", "B08_may","NDVI_may")
writeRaster(result, "fogo_ndvi2.tif", overwrite=TRUE, datatype = "FLT4S" )
sentinel_nov <- rast("sentinel2_20221127_ndvi.tif")
sentinel_may <- rast("fogo_ndvi2.tif")
# combine
sentinel <- c(sentinel_may, sentinel_nov)
# polygons
ref_data <- vect("reference1.gpkg")
table(ref_data$class)
##
## Agriculture Basaltic rock (old)
## 20 19
## Basaltic rock (young) Forest
## 17 15
## Human settlements Land with little or no vegetation
## 16 37
## Ocean
## 16
we verify if it’s the same place in may and nov
# may
plotRGB(sentinel, b =1,
g =2,
r = 3,
stretch="lin")
# nov
plotRGB(sentinel, b =6,
g =7,
r = 8,
stretch="lin")
plot(ref_data, add=TRUE, col ="red")
Random forest
df_extract <- extract(sentinel, ref_data, na.rm = TRUE)
ref_data$ID <- seq(1, nrow( ref_data))
df_extract <- merge(df_extract, ref_data)
# create model
rfmodel = randomForest(x = df_extract[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may",
"B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
y = as.factor( df_extract[, c("class")] ), ntree = 100)
Criteria for classification:
# apply model
lcc <- predict(sentinel, rfmodel)
plot(lcc, col=c("orange", "brown", "black", "darkgreen","purple","yellowgreen","darkblue"))
# Create a partition (70% training data, 30% test data)
trainIndex <- createDataPartition(df_extract$class, p = 0.7, list = FALSE)
# Split data into training and testing sets
trainData <- df_extract[trainIndex, ] #data que si esta viendo el modelo
testData <- df_extract[-trainIndex, ] #data que no esta viendo el modelo
rfmodel2 = randomForest(x = trainData[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may",
"B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
y = as.factor( trainData[, c("class")] ),
ntree = 100)
Apply model with traindata and testdata
lcc2 = predict(sentinel, rfmodel2)
## |---------|---------|---------|---------|=========================================
plot(lcc2, col=c("orange", "brown", "black", "darkgreen","purple","yellowgreen","darkblue"))
writeRaster(lcc2, "fogo_model2.tif", overwrite = TRUE)
Accuracy with test data
# predict the classes of the independent test set
test_prediction = predict(rfmodel2, testData)
# confusion matrix
cfm = table(testData$class, test_prediction)
cfm
## test_prediction
## Agriculture Basaltic rock (old)
## Agriculture 1072 3
## Basaltic rock (old) 7 3586
## Basaltic rock (young) 1 106
## Forest 3 0
## Human settlements 10 302
## Land with little or no vegetation 133 203
## Ocean 0 0
## test_prediction
## Basaltic rock (young) Forest
## Agriculture 6 3
## Basaltic rock (old) 51 0
## Basaltic rock (young) 5952 0
## Forest 0 2168
## Human settlements 6 0
## Land with little or no vegetation 1 9
## Ocean 1 0
## test_prediction
## Human settlements
## Agriculture 5
## Basaltic rock (old) 283
## Basaltic rock (young) 1
## Forest 0
## Human settlements 3952
## Land with little or no vegetation 100
## Ocean 0
## test_prediction
## Land with little or no vegetation Ocean
## Agriculture 292 0
## Basaltic rock (old) 244 2
## Basaltic rock (young) 2 1
## Forest 13 0
## Human settlements 223 0
## Land with little or no vegetation 5227 0
## Ocean 0 19701
# accuracy: sum of diagonals divided by all
sum(diag(cfm))/sum(cfm)
## [1] 0.953949
Cross validation
# set up cross validation with 5 folds
trc = trainControl(method = "cv", number = 5)
# tune model mtry
rfmodel_cv_may = caret::train(x = trainData[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may")],
y = trainData[, c("class")],
method = "rf",
trControl = trc,
ntree = 100,
tuneLength = 3)
rfmodel_cv_nov = caret::train(x = trainData[,c("B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
y = trainData[, c("class")],
method = "rf",
trControl = trc,
ntree = 100,
tuneLength = 3)
rfmodel_cv_full = caret::train(x = trainData[,c("B02_may", "B03_may", "B04_may", "B08_may", "NDVI_may",
"B02_nov", "B03_nov", "B04_nov", "B08_nov", "NDVI_nov")],
y = trainData[, c("class")],
method = "rf",
trControl = trc,
ntree = 100,
tuneLength = 3)
# predict the classes of the independet test set
test_prediction <- predict(rfmodel_cv_full, testData)
# confusion matrix
confusionMatrix(as.factor(testData$class), test_prediction, mode="everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Agriculture Basaltic rock (old)
## Agriculture 1058 3
## Basaltic rock (old) 7 3576
## Basaltic rock (young) 3 114
## Forest 4 0
## Human settlements 9 305
## Land with little or no vegetation 143 206
## Ocean 0 0
## Reference
## Prediction Basaltic rock (young) Forest
## Agriculture 7 2
## Basaltic rock (old) 49 0
## Basaltic rock (young) 5941 0
## Forest 0 2166
## Human settlements 6 0
## Land with little or no vegetation 1 15
## Ocean 1 0
## Reference
## Prediction Human settlements
## Agriculture 5
## Basaltic rock (old) 283
## Basaltic rock (young) 2
## Forest 0
## Human settlements 3949
## Land with little or no vegetation 91
## Ocean 0
## Reference
## Prediction Land with little or no vegetation Ocean
## Agriculture 306 0
## Basaltic rock (old) 256 2
## Basaltic rock (young) 2 1
## Forest 14 0
## Human settlements 224 0
## Land with little or no vegetation 5217 0
## Ocean 0 19701
##
## Overall Statistics
##
## Accuracy : 0.9528
## 95% CI : (0.9508, 0.9548)
## No Information Rate : 0.4512
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9359
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Agriculture Class: Basaltic rock (old)
## Sensitivity 0.86438 0.85062
## Specificity 0.99239 0.98487
## Pos Pred Value 0.76611 0.85694
## Neg Pred Value 0.99607 0.98410
## Precision 0.76611 0.85694
## Recall 0.86438 0.85062
## F1 0.81228 0.85377
## Prevalence 0.02803 0.09627
## Detection Rate 0.02423 0.08189
## Detection Prevalence 0.03162 0.09556
## Balanced Accuracy 0.92838 0.91775
## Class: Basaltic rock (young) Class: Forest
## Sensitivity 0.9893 0.99221
## Specificity 0.9968 0.99957
## Pos Pred Value 0.9799 0.99176
## Neg Pred Value 0.9983 0.99959
## Precision 0.9799 0.99176
## Recall 0.9893 0.99221
## F1 0.9846 0.99199
## Prevalence 0.1375 0.04999
## Detection Rate 0.1360 0.04960
## Detection Prevalence 0.1388 0.05001
## Balanced Accuracy 0.9931 0.99589
## Class: Human settlements
## Sensitivity 0.91201
## Specificity 0.98617
## Pos Pred Value 0.87892
## Neg Pred Value 0.99027
## Precision 0.87892
## Recall 0.91201
## F1 0.89516
## Prevalence 0.09916
## Detection Rate 0.09043
## Detection Prevalence 0.10289
## Balanced Accuracy 0.94909
## Class: Land with little or no vegetation Class: Ocean
## Sensitivity 0.8668 0.9998
## Specificity 0.9879 1.0000
## Pos Pred Value 0.9196 0.9999
## Neg Pred Value 0.9789 0.9999
## Precision 0.9196 0.9999
## Recall 0.8668 0.9998
## F1 0.8924 0.9999
## Prevalence 0.1378 0.4512
## Detection Rate 0.1195 0.4511
## Detection Prevalence 0.1299 0.4512
## Balanced Accuracy 0.9273 0.9999
Definition: The proportion of correctly classified instances among the total instances. Value: 0.9531 (95.31%) Interpretation: The model correctly classified 95.31% of all instances.
Definition: The range within which the true accuracy is expected to fall, with 95% confidence. Value: (0.9511, 0.9551) Interpretation: We are 95% confident that the true accuracy of the model lies between 95.11% and 95.51%.
Definition: The p-value testing whether the model’s accuracy is significantly better than the No Information Rate (NIR), which is the accuracy expected by chance. Value: < 2.2e-16 Interpretation: The model’s accuracy is highly significantly better than random chance, with a p-value much smaller than 0.05.
Definition: A measure of the agreement between predicted and actual classifications, adjusting for chance agreement. Value: 0.9364 Interpretation: There is a very high level of agreement between the predicted and actual classes, much better than random chance.
Definition: The proportion of correctly predicted positive instances among all predicted positives. Interpretation: Precision is calculated per class, for example: For “bare_ground”: 0.9117 (91.17%) means that when the model predicts “bare_ground,” it is correct 91.17% of the time. Precision values vary by class, reflecting the model’s reliability for each class prediction.
Definition: The proportion of correctly predicted positives among all actual positives. Interpretation: Recall is also calculated per class: For “bare_ground”: 0.8734 (87.34%) means that the model correctly identifies 87.34% of all actual “bare_ground” instances. Higher recall means fewer actual positives are missed.
Definition: The harmonic mean of precision and recall, providing a balanced measure that accounts for both false positives and false negatives. Interpretation: A high F1 score indicates a good balance between precision and recall.
# data
elevation<- rast("elevation_fogo.tif")
aspect <-rast("aspect_fogo.tif")
slope <- rast("slope_zona26.tif")
Extract values from the rasters
# Elevation
lcc2_resampled <- resample(lcc2, elevation, method = "near")
values_df <- as.data.frame(c(lcc2_resampled, elevation), na.rm = TRUE)
colnames(values_df) <- c("land_cover", "elevation")
values_df$land_cover <- as.factor(values_df$land_cover)
values_df$land_cover_num <- as.numeric(as.factor(values_df$land_cover))
# Aspect
lcc3_resampled <- resample(lcc2, aspect, method = "near")
values_df2 <- as.data.frame(c(lcc3_resampled, aspect), na.rm = TRUE)
colnames(values_df2) <- c("land_cover", "aspect")
values_df2$land_cover <- as.factor(values_df2$land_cover)
values_df2$land_cover_num <- as.numeric(as.factor(values_df2$land_cover))
# Slope
lcc4_resampled <- resample(lcc2, slope, method = "near")
values_df3 <- as.data.frame(c(lcc4_resampled, slope), na.rm = TRUE)
colnames(values_df3) <- c("land_cover", "slope")
values_df3$land_cover <- as.factor(values_df3$land_cover)
values_df3$land_cover_num <- as.numeric(as.factor(values_df3$land_cover))
# Elevation
hist(values_df$elevation, main="Elevation", xlab="elevation")
# Aspect
hist(values_df2$aspect, main="Aspect", xlab="Aspect")
# Elevation
hist(values_df3$slope, main="Slope", xlab="Slope")
Because of the lack of normality in elevation and slope, we use a
non-parametric method: Kruskal-Wallis, which is adequate to test a
categorical variable with a continuous.It evaluates the median of the
slope/elevation within 2 or more groups of land cover. it’s the
non-parametric alternative for ANOVA.
For Aspect, because we have 2 categorical variables, we will use contingency tables, Cramer’s and Fisher’s test.
# Kruskal test
kruskal_elevation <- kruskal.test(elevation ~ land_cover, data = values_df)
kruskal_elevation
##
## Kruskal-Wallis rank sum test
##
## data: elevation by land_cover
## Kruskal-Wallis chi-squared = 154.99, df = 6, p-value < 2.2e-16
P-value < 0.05, elevation can differ with the type of land cover. But to see which of the land use may be the one that has the strongest relation we can visualize it on a box-plot.
boxplot(elevation ~ land_cover, data = values_df,
xlab = "Land Cover",
ylab = "Elevation",
main = "Elevation & Land Cover",
col = "lightblue",
las = 2)
Ocean and basaltic rock (old) may be the land cover types with the
highest relation with elevation because of its small variability.
# names to aspect
values_df2 <- values_df2 %>%
mutate(aspect2 = case_when(
aspect == 2 ~ "N",
aspect == 3 ~ "NE",
aspect == 4 ~ "E",
aspect == 5 ~ "SE",
aspect == 6 ~ "S",
aspect == 7 ~ "SW",
aspect == 8 ~ "W",
aspect == 9 ~ "NW",
aspect == 10 ~ "N",
TRUE ~ as.character(aspect)
))
# contingency table
table_cont<- table(values_df2$land_cover, values_df2$aspect2)
filtered_table <- table_cont[rowSums(table_cont) > 0, colSums(table_cont) > 0]
# Correspondence analysis
ac <- CA(filtered_table, graph = FALSE)
summary(ac)
##
## Call:
## CA(X = filtered_table, graph = FALSE)
##
## The chi square of independence between the two variables is equal to 235.7635 (p-value = 1.305044e-31 ).
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Variance 0.298 0.163 0.063 0.034 0.017
## % of var. 51.783 28.325 10.936 5.970 2.987
## Cumulative % of var. 51.783 80.108 91.044 97.013 100.000
##
## Rows
## Iner*1000 Dim.1 ctr cos2
## Agriculture | 105.657 | 0.428 5.708 0.161 |
## Basaltic rock (old) | 113.926 | 0.387 6.629 0.173 |
## Basaltic rock (young) | 96.980 | 0.802 26.329 0.808 |
## Forest | 117.769 | 1.560 25.910 0.655 |
## Human settlements | 35.385 | 0.465 1.418 0.119 |
## Land with little or no vegetation | 105.316 | -0.410 34.007 0.962 |
## Dim.2 ctr cos2 Dim.3 ctr
## Agriculture -0.836 39.742 0.613 | 0.488 35.077
## Basaltic rock (old) 0.812 53.312 0.762 | 0.166 5.760
## Basaltic rock (young) -0.007 0.003 0.000 | -0.064 0.800
## Forest -0.438 3.729 0.052 | -0.905 41.310
## Human settlements 0.469 2.630 0.121 | 0.628 12.225
## Land with little or no vegetation -0.040 0.584 0.009 | -0.071 4.829
## cos2
## Agriculture 0.209 |
## Basaltic rock (old) 0.032 |
## Basaltic rock (young) 0.005 |
## Forest 0.221 |
## Human settlements 0.217 |
## Land with little or no vegetation 0.029 |
##
## Columns
## Iner*1000 Dim.1 ctr cos2
## E | 84.466 | 0.711 12.025 0.424 |
## N | 54.292 | 0.672 10.734 0.589 |
## NE | 131.165 | 1.341 38.272 0.869 |
## NW | 62.316 | 0.145 0.811 0.039 |
## S | 61.464 | -0.332 7.386 0.358 |
## SE | 41.844 | 0.273 1.893 0.135 |
## SW | 96.902 | -0.597 28.630 0.880 |
## W | 42.585 | -0.067 0.249 0.017 |
## Dim.2 ctr cos2 Dim.3 ctr
## E 0.611 16.224 0.313 | 0.485 26.431
## N -0.265 3.041 0.091 | -0.351 13.844
## NE -0.075 0.220 0.003 | -0.473 22.527
## NW -0.604 25.704 0.672 | 0.341 21.186
## S 0.394 19.016 0.504 | -0.063 1.274
## SE 0.641 19.099 0.743 | 0.118 1.660
## SW -0.047 0.320 0.005 | -0.151 8.703
## W -0.401 16.375 0.626 | 0.129 4.375
## cos2
## E 0.197 |
## N 0.160 |
## NE 0.108 |
## NW 0.214 |
## S 0.013 |
## SE 0.025 |
## SW 0.056 |
## W 0.065 |
# plot correspondence analysis
fviz_ca_biplot(ac, repel = TRUE)
p-value <0.05 = significant relation between the 2 variables
DIM 1:
On the right side of Dim.1: Categories such as “Basaltic rock (young)”, “Forest”, and columns like “NE” are found, suggesting an association of these areas with more natural and geologically young environments.
On the left side of Dim.1: Categories like “Land with little or no vegetation” are grouped, which is associated with arid or sparsely vegetated terrains, along with directions like “W” and “SW.”
DIM 2:
Above Dim.2: “Human settlements” and “Basaltic rock (old)” are located.
Below Dim.2: “Agriculture” is found, implying that this dimension also distinguishes predominantly agricultural areas from other categories. “NW” and “S” are near “Agriculture,” which could indicate some relationship or geographical proximity to cultivated areas.
Dim.1 seems to represent a gradient of vegetation, ranging from arid or sparsely vegetated areas to densely forested areas or young basaltic rocks.
Dim.2 captures a gradient of human intervention and geological age.
# Kruskal test
kruskal_slope <- kruskal.test(slope ~ land_cover, data = values_df3)
kruskal_slope
##
## Kruskal-Wallis rank sum test
##
## data: slope by land_cover
## Kruskal-Wallis chi-squared = 51.688, df = 6, p-value = 2.154e-09
P-value < 0.05, slope can differ with the type of land cover. But to see which of the land use may be the one that has the strongest relation we can visualize it on a box-plot.
boxplot(slope ~ land_cover, data = values_df3,
xlab = "Land Cover",
ylab = "Slope",
main = "Slope & Land Cover",
col = "lightblue",
las = 2)
Forest may be the land cover type with the highest relation with slope because of its small variability.